sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.5.2  magrittr_1.5    tools_3.5.2     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.0      stringi_1.2.4   rmarkdown_1.11 
##  [9] knitr_1.21      stringr_1.3.1   xfun_0.4        digest_0.6.18  
## [13] evaluate_0.12

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

## input attrition file
setwd("~/Documents/SMU/Doing DS/Unit 14")

emp <- read.xlsx("CaseStudy2-data.xlsx", sheet = 1, startRow = 1, colNames = TRUE)
#emp <- read.xlsx("CaseStudy2-data.xlsx", sheet = 1, startRow = 1, colNames = TRUE, rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE, skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE, namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE)

head(emp)
##   Age Attrition Attr_Ind    BusinessTravel BT1 BT2 DailyRate
## 1  41       Yes        1     Travel_Rarely   1   0      1102
## 2  49        No        0 Travel_Frequently   0   1       279
## 3  37       Yes        1     Travel_Rarely   1   0      1373
## 4  33        No        0 Travel_Frequently   0   1      1392
## 5  27        No        0     Travel_Rarely   1   0       591
## 6  32        No        0 Travel_Frequently   0   1      1005
##               Department DistanceFromHome Education EducationField
## 1                  Sales                1         2  Life Sciences
## 2 Research & Development                8         1  Life Sciences
## 3 Research & Development                2         2          Other
## 4 Research & Development                3         4  Life Sciences
## 5 Research & Development                2         1        Medical
## 6 Research & Development                2         2  Life Sciences
##   EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate
## 1             1              1                       2 Female         94
## 2             1              2                       3   Male         61
## 3             1              4                       4   Male         92
## 4             1              5                       4 Female         56
## 5             1              7                       1   Male         40
## 6             1              8                       4   Male         79
##   JobInvolvement JobLevel               JobRole JobSatisfaction
## 1              3        2       Sales Executive               4
## 2              2        2    Research Scientist               2
## 3              2        1 Laboratory Technician               3
## 4              3        1    Research Scientist               3
## 5              3        1 Laboratory Technician               2
## 6              3        1 Laboratory Technician               4
##   MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18
## 1        Single          5993       19479                  8      Y
## 2       Married          5130       24907                  1      Y
## 3        Single          2090        2396                  6      Y
## 4       Married          2909       23159                  1      Y
## 5       Married          3468       16632                  9      Y
## 6        Single          3068       11864                  0      Y
##   OverTime OT_ind PercentSalaryHike PerformanceRating
## 1      Yes      1                11                 3
## 2       No      0                23                 4
## 3      Yes      1                15                 3
## 4      Yes      1                11                 3
## 5       No      0                12                 3
## 6       No      0                13                 3
##   RelationshipSatisfaction StandardHours StockOptionLevel
## 1                        1            80                0
## 2                        4            80                1
## 3                        2            80                0
## 4                        3            80                0
## 5                        4            80                1
## 6                        3            80                0
##   TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1                 8                     0               1              6
## 2                10                     3               3             10
## 3                 7                     3               3              0
## 4                 8                     3               3              8
## 5                 6                     3               3              2
## 6                 8                     2               2              7
##   YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1                  4                       0                    5
## 2                  7                       1                    7
## 3                  0                       0                    0
## 4                  7                       3                    0
## 5                  2                       2                    2
## 6                  7                       3                    6
#Missing values check

#dimension
dim(emp)
## [1] 1470   39
#summary
summary(emp$MonthlyIncome)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1009    2911    4919    6503    8379   19999
summary(emp$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   30.00   36.00   36.92   43.00   60.00
summary(as.numeric(emp$YearsAtCompany))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   5.000   7.008   9.000  40.000
summary(emp$NumCompaniesWorked)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.000   2.693   4.000   9.000
summary(emp$DistanceFromHome)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   7.000   9.193  14.000  29.000
summary(emp$YearsInCurrentRole)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   3.000   4.229   7.000  18.000
summary(emp$JobSatisfaction)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   2.729   4.000   4.000
hist(emp$MonthlyIncome)

hist(emp$Age)

hist(as.numeric(emp$YearsAtCompany))

hist(emp$NumCompaniesWorked)

hist(emp$DistanceFromHome)

hist(emp$YearsInCurrentRole)

freq(emp$Gender)
##          n  % val%
## Female 588 40   40
## Male   882 60   60
freq(emp$Education)
##     n    % val%
## 1 170 11.6 11.6
## 2 282 19.2 19.2
## 3 572 38.9 38.9
## 4 398 27.1 27.1
## 5  48  3.3  3.3
freq(emp$JobRole)
##                             n    % val%
## Healthcare Representative 131  8.9  8.9
## Human Resources            52  3.5  3.5
## Laboratory Technician     259 17.6 17.6
## Manager                   102  6.9  6.9
## Manufacturing Director    145  9.9  9.9
## Research Director          80  5.4  5.4
## Research Scientist        292 19.9 19.9
## Sales Executive           326 22.2 22.2
## Sales Representative       83  5.6  5.6
## input age file
fit <- lm(Age ~ MonthlyIncome + NumCompaniesWorked + YearsAtCompany + OT_ind + BT1 + BT2 + Attr_Ind, data=emp)


summary(fit)
## 
## Call:
## lm(formula = Age ~ MonthlyIncome + NumCompaniesWorked + YearsAtCompany + 
##     OT_ind + BT1 + BT2 + Attr_Ind, data = emp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.3916  -5.4903  -0.9349   4.2522  26.7550 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        27.9418089  0.7173773  38.950  < 2e-16 ***
## MonthlyIncome       0.0007144  0.0000506  14.119  < 2e-16 ***
## NumCompaniesWorked  0.9776191  0.0818675  11.941  < 2e-16 ***
## YearsAtCompany      0.2082472  0.0384393   5.418 7.06e-08 ***
## OT_ind              1.2025377  0.4501058   2.672  0.00763 ** 
## BT1                 0.3568153  0.6581930   0.542  0.58782    
## BT2                 0.4648894  0.7681020   0.605  0.54511    
## Attr_Ind           -2.7124446  0.5644751  -4.805 1.70e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.518 on 1462 degrees of freedom
## Multiple R-squared:  0.326,  Adjusted R-squared:  0.3227 
## F-statistic:   101 on 7 and 1462 DF,  p-value: < 2.2e-16
# Assessing Outliers
#outlierTest(fit) # Bonferonni p-value for most extreme obs
qqPlot(fit, main="QQ Plot")  #qq plot for studentized resid 

## [1]  939 1355
leveragePlots(fit) # leverage plots

# Normality of Residuals

# distribution of studentized residuals
library(MASS)
sresid <- studres(fit) 
hist(sresid, freq=FALSE, main="Distribution of Studentized Residuals")
xfit<-seq(min(sresid),max(sresid),length=40) 
yfit<-dnorm(xfit) 
lines(xfit, yfit)

# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fit)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.4922889, Df = 1, p = 0.48291
# plot studentized residuals vs. fitted values 
spreadLevelPlot(fit)
## 
## Suggested power transformation:  0.7645897
# Evaluate Collinearity
vif(fit) # variance inflation factors 
##      MonthlyIncome NumCompaniesWorked     YearsAtCompany 
##           1.475043           1.086961           1.441393 
##             OT_ind                BT1                BT2 
##           1.069121           2.322108           2.346506 
##           Attr_Ind 
##           1.120636
sqrt(vif(fit)) > 2 # problem?
##      MonthlyIncome NumCompaniesWorked     YearsAtCompany 
##              FALSE              FALSE              FALSE 
##             OT_ind                BT1                BT2 
##              FALSE              FALSE              FALSE 
##           Attr_Ind 
##              FALSE
# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fit)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.4922889, Df = 1, p = 0.48291
# plot studentized residuals vs. fitted values 
spreadLevelPlot(fit)

## 
## Suggested power transformation:  0.7645897
# Evaluate Nonlinearity
# component + residual plot 
crPlots(fit)

# Ceres plots 
#ceresPlots(fit)

# Test for Autocorrelated Errors
durbinWatsonTest(fit)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01150988       2.02301   0.632
##  Alternative hypothesis: rho != 0
# Global test of model assumptions
library(gvlma)
gvmodel <- gvlma(fit) 
summary(gvmodel)
## 
## Call:
## lm(formula = Age ~ MonthlyIncome + NumCompaniesWorked + YearsAtCompany + 
##     OT_ind + BT1 + BT2 + Attr_Ind, data = emp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.3916  -5.4903  -0.9349   4.2522  26.7550 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        27.9418089  0.7173773  38.950  < 2e-16 ***
## MonthlyIncome       0.0007144  0.0000506  14.119  < 2e-16 ***
## NumCompaniesWorked  0.9776191  0.0818675  11.941  < 2e-16 ***
## YearsAtCompany      0.2082472  0.0384393   5.418 7.06e-08 ***
## OT_ind              1.2025377  0.4501058   2.672  0.00763 ** 
## BT1                 0.3568153  0.6581930   0.542  0.58782    
## BT2                 0.4648894  0.7681020   0.605  0.54511    
## Attr_Ind           -2.7124446  0.5644751  -4.805 1.70e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.518 on 1462 degrees of freedom
## Multiple R-squared:  0.326,  Adjusted R-squared:  0.3227 
## F-statistic:   101 on 7 and 1462 DF,  p-value: < 2.2e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = fit) 
## 
##                      Value   p-value                   Decision
## Global Stat        147.935 0.0000000 Assumptions NOT satisfied!
## Skewness           120.341 0.0000000 Assumptions NOT satisfied!
## Kurtosis             8.124 0.0043692 Assumptions NOT satisfied!
## Link Function       17.050 0.0000364 Assumptions NOT satisfied!
## Heteroscedasticity   2.419 0.1198383    Assumptions acceptable.
## input Attrition 
fitAttr <- lm( MonthlyIncome ~ Age  + NumCompaniesWorked + YearsAtCompany + OT_ind + BT1 + BT2 + Attr_Ind, data=emp)



summary(fitAttr)
## 
## Call:
## lm(formula = MonthlyIncome ~ Age + NumCompaniesWorked + YearsAtCompany + 
##     OT_ind + BT1 + BT2 + Attr_Ind, data = emp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9913.0 -2282.8  -478.7  1590.7 13260.2 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2695.11     491.47  -5.484  4.9e-08 ***
## Age                  167.94      11.90  14.119  < 2e-16 ***
## NumCompaniesWorked   195.08      41.27   4.727  2.5e-06 ***
## YearsAtCompany       320.99      16.85  19.053  < 2e-16 ***
## OT_ind               193.53     218.71   0.885  0.37635    
## BT1                  381.18     319.00   1.195  0.23230    
## BT2                  110.22     372.45   0.296  0.76733    
## Attr_Ind            -769.36     275.10  -2.797  0.00523 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3645 on 1462 degrees of freedom
## Multiple R-squared:  0.4034, Adjusted R-squared:  0.4005 
## F-statistic: 141.2 on 7 and 1462 DF,  p-value: < 2.2e-16
# Assessing Outliers
#outlierTest(fit) # Bonferonni p-value for most extreme obs

qqPlot(fitAttr, main="QQ Plot")  #qq plot for studentized resid 

## [1] 1130 1438
leveragePlots(fitAttr) # leverage plots

# Normality of Residuals

# distribution of studentized residuals
library(MASS)
sresid <- studres(fitAttr) 
hist(sresid, freq=FALSE, main="Distribution of Studentized Residuals")
xfit<-seq(min(sresid),max(sresid),length=40) 
yfit<-dnorm(xfit) 
lines(xfit, yfit)

# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fitAttr)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 237.7393, Df = 1, p = < 2.22e-16
# plot studentized residuals vs. fitted values 
spreadLevelPlot(fitAttr)
## Warning in spreadLevelPlot.lm(fitAttr): 
## 2 negative fitted values removed
## 
## Suggested power transformation:  0.07475701
# Evaluate Collinearity
vif(fitAttr) # variance inflation factors 
##                Age NumCompaniesWorked     YearsAtCompany 
##           1.305565           1.175022           1.177871 
##             OT_ind                BT1                BT2 
##           1.073766           2.320309           2.346954 
##           Attr_Ind 
##           1.132278
sqrt(vif(fitAttr)) > 2 # problem?
##                Age NumCompaniesWorked     YearsAtCompany 
##              FALSE              FALSE              FALSE 
##             OT_ind                BT1                BT2 
##              FALSE              FALSE              FALSE 
##           Attr_Ind 
##              FALSE
# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fitAttr)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 237.7393, Df = 1, p = < 2.22e-16
# plot studentized residuals vs. fitted values 
spreadLevelPlot(fitAttr)
## Warning in spreadLevelPlot.lm(fitAttr): 
## 2 negative fitted values removed

## 
## Suggested power transformation:  0.07475701
# Evaluate Nonlinearity
# component + residual plot 
crPlots(fitAttr)

# Ceres plots 
#ceresPlots(fitAttr)

# Test for Autocorrelated Errors
durbinWatsonTest(fitAttr)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01872124      2.037305   0.458
##  Alternative hypothesis: rho != 0
# Global test of model assumptions
library(gvlma)
gvmodel <- gvlma(fitAttr) 
summary(gvmodel)
## 
## Call:
## lm(formula = MonthlyIncome ~ Age + NumCompaniesWorked + YearsAtCompany + 
##     OT_ind + BT1 + BT2 + Attr_Ind, data = emp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9913.0 -2282.8  -478.7  1590.7 13260.2 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2695.11     491.47  -5.484  4.9e-08 ***
## Age                  167.94      11.90  14.119  < 2e-16 ***
## NumCompaniesWorked   195.08      41.27   4.727  2.5e-06 ***
## YearsAtCompany       320.99      16.85  19.053  < 2e-16 ***
## OT_ind               193.53     218.71   0.885  0.37635    
## BT1                  381.18     319.00   1.195  0.23230    
## BT2                  110.22     372.45   0.296  0.76733    
## Attr_Ind            -769.36     275.10  -2.797  0.00523 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3645 on 1462 degrees of freedom
## Multiple R-squared:  0.4034, Adjusted R-squared:  0.4005 
## F-statistic: 141.2 on 7 and 1462 DF,  p-value: < 2.2e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = fitAttr) 
## 
##                       Value p-value                   Decision
## Global Stat        307.4495 0.00000 Assumptions NOT satisfied!
## Skewness           195.9303 0.00000 Assumptions NOT satisfied!
## Kurtosis           106.3839 0.00000 Assumptions NOT satisfied!
## Link Function        4.5831 0.03229 Assumptions NOT satisfied!
## Heteroscedasticity   0.5522 0.45741    Assumptions acceptable.
library(tidyr)  # data tidying (e.g., spread)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
## 
##     complete
## The following object is masked from 'package:pastecs':
## 
##     extract
## The following object is masked from 'package:magrittr':
## 
##     extract
# explore status/terminations by Department
#status_count <- with(emp, table(Department, Attrition))
#status_count <- spread(data.frame(status_count), Attrition, Freq)
#status_count$previous_active <- shift(status_count$ACTIVE, 1L, type = "lag")
#status_count$percent_terminated <- 100*status_count$Yes / status_count$previous_active
#status_count



# chart data in questions
#ggplot() + geom_bar(aes(y = ..count..,x = STATUS_YEAR, fill = termreason_desc), data=terms, position = position_stack()) +
#  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# "Age","BusinessTravel","Department","DistanceFromHome","Education","EducationField", "Gender", "JobInvolvement", "JobLevel", "JobRole", "JobSatisfaction", "MaritalStatus", "MonthlyIncome", "NumCompaniesWorked",  "Overtime", "PercentSalaryHike","PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear", "WorkLifeBalance","YearsAtCompany","YearsInCurrentRole","YearsSinceLastPromotion","YearsWithCurrManager","Attrition"

# Partition the data into training and test sets
term_vars <- c("Age","Education","MonthlyIncome","BusinessTravel","Department","DistanceFromHome","EducationField", "Gender","JobInvolvement","JobLevel", "JobSatisfaction", "MaritalStatus", "MonthlyIncome","NumCompaniesWorked", "OverTime","PercentSalaryHike","PerformanceRating", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear", "WorkLifeBalance","RelationshipSatisfaction","YearsAtCompany","YearsInCurrentRole", "YearsSinceLastPromotion","YearsWithCurrManager","Attrition")
emp_term_train <- subset(emp, Education < 4)
emp_term_test <- subset(emp, Education >= 4)
set.seed(99)  # set a pre-defined value for the random seed so that results are repeatable
# Decision tree model
rpart_model <- rpart(Attrition ~.,
                     data = emp_term_train[term_vars],
                     method = 'class',
                     parms = list(split='information'),
                     control = rpart.control(usesurrogate = 0,
                                             maxsurrogate = 0))
# Plot the decision tree
rpart.plot(rpart_model, roundint = FALSE, type = 3)

##

# plot terminated & active by age
emp$resigned <- as.factor(emp$Attrition)
summary(emp$resigned)
##   No  Yes 
## 1233  237
featurePlot(x=emp$Age, y=emp$resigned, plot="density",
      auto.key = list(columns = 2), labels = c("Age (years)", ""))

##

# plot terminated & active by MonthlyIncome

featurePlot(x=emp$MonthlyIncome, y=emp$resigned, plot="density",
      auto.key = list(columns = 2), labels = c("Monthly Income", ""))

##

featurePlot(x=emp$YearsAtCompany, y=emp$resigned, plot="density",
      auto.key = list(columns = 2), labels = c("Monthly Income", ""))

Summary statistics for the ABV variable.

# Subset the data 
emp_cat <- data.frame(emp$Age,emp$Education,emp$MonthlyIncome,emp$DistanceFromHome,emp$JobInvolvement,emp$JobLevel, emp$JobSatisfaction, emp$NumCompaniesWorked, emp$PercentSalaryHike,emp$PerformanceRating, emp$StockOptionLevel, emp$TotalWorkingYears,emp$TrainingTimesLastYear, emp$WorkLifeBalance,emp$RelationshipSatisfaction,emp$YearsAtCompany,emp$YearsInCurrentRole,emp$resigned,emp$EmployeeNumber)
col_headings <- c("Age","Education","MonthlyIncome","DistanceFromHome","JobInvolvement","JobLevel", "JobSatisfaction", "NumCompaniesWorked", "PercentSalaryHike","PerformanceRating", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear", "WorkLifeBalance","RelationshipSatisfaction","YearsAtCompany","YearsInCurrentRole","resigned","EmployeeNumber")
names(emp_cat) <- col_headings
emp_train <- subset(emp_cat, Education < 4)
emp_test <- subset(emp_cat, Education >= 4)

emp_train_rose <- ROSE(resigned ~ ., data = emp_train, seed=125)$data
# Tables to show balanced dataset sample sizes
table(emp_train_rose$resigned)
## 
##  No Yes 
## 521 503
# Select variables (res_vars) for the model to predict 'resigned'
res_vars <- c("Age","Education","MonthlyIncome","DistanceFromHome","JobInvolvement","JobLevel", "JobSatisfaction", "NumCompaniesWorked", "PercentSalaryHike","PerformanceRating", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear", "WorkLifeBalance","RelationshipSatisfaction","YearsAtCompany","YearsInCurrentRole","resigned")
set.seed(222)
emp_res_rose_RF <- randomForest(resigned ~ .,
                           data = emp_train_rose[res_vars],
                           ntree=500, importance = TRUE,
                           na.action = na.omit)
varImpPlot(emp_res_rose_RF,type=1,
           main="Variable Importance (Accuracy)",
           sub = "Random Forest Model")

#var_importance <- importance(emp_res_rose_RF)
#emp_res_rose_RF  # view results & Confusion matrix
# generate predictions based on test data ("emp_test")
emp_res_rose_RF_pred <- predict(emp_res_rose_RF, newdata = emp_test)
confusionMatrix(data = emp_res_rose_RF_pred,
                reference = emp_test$resigned,
                positive = "Yes", mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  255  32
##        Yes 128  31
##                                           
##                Accuracy : 0.6413          
##                  95% CI : (0.5948, 0.6858)
##     No Information Rate : 0.8587          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0965          
##  Mcnemar's Test P-Value : 5.894e-14       
##                                           
##               Precision : 0.19497         
##                  Recall : 0.49206         
##                      F1 : 0.27928         
##              Prevalence : 0.14126         
##          Detection Rate : 0.06951         
##    Detection Prevalence : 0.35650         
##       Balanced Accuracy : 0.57893         
##                                           
##        'Positive' Class : Yes             
## 

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Calculate prediction probabilites of employees who will resign
emp_res_rose_RF_pred_probs <- predict(emp_res_rose_RF, emp_test, type="prob")
Employees_flight_risk <- as.data.frame(cbind(emp_test$EmployeeNumber,emp_res_rose_RF_pred_probs))
col_headings <- c("EmployeeNumber","No","Yes")
names(Employees_flight_risk) <- col_headings
Employees_flight_risk <- arrange(Employees_flight_risk, desc(Yes))
head(Employees_flight_risk)
##   EmployeeNumber    No   Yes
## 1            565 0.090 0.910
## 2            939 0.116 0.884
## 3           2010 0.188 0.812
## 4           1427 0.192 0.808
## 5            648 0.202 0.798
## 6           1842 0.202 0.798
tail(Employees_flight_risk)
##     EmployeeNumber    No   Yes
## 441           1630 0.732 0.268
## 442           1061 0.740 0.260
## 443           1932 0.744 0.256
## 444           1225 0.754 0.246
## 445            323 0.760 0.240
## 446           1116 0.810 0.190